# Load the dataset
scrna.data <- Read10X(data.dir = here::here(
    '..','..','scrnaseq_tcsl154',
    'cellranger','outs','raw_feature_bc_matrix'))

#rename some adt features
hto_rows <- paste('Hashtag',1:12,sep='_')

adt_feature_names <- fread(here::here(
    '..','..','scrnaseq_tcsl154',
    'meta','adt_names.tsv'))
rownames(scrna.data$`Antibody Capture`) <- c(paste0(adt_feature_names$vis_name,'.adt'), hto_rows)


#rna features
scrna <- CreateSeuratObject(
    counts = scrna.data$`Gene Expression`, 
    project = "tcsl154", 
    min.cells = 3,
    min.features = 200)

adt_rows <- setdiff(rownames(scrna.data$`Antibody Capture`), hto_rows)

#adt features
scrna[['ADT']] <- CreateAssayObject(
    counts = scrna.data$`Antibody Capture`[adt_rows, colnames(scrna)])

RNA QC metrics

scrna[["percent.mt"]] <- PercentageFeatureSet(scrna, pattern = "^MT-")

VlnPlot(scrna, 
  pt.size=F, 
  features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

plot1 <- FeatureScatter(
  scrna, feature1 = "nCount_RNA", feature2 = "percent.mt")

plot2 <- FeatureScatter(
  scrna, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")

plot1 + plot2

scrna <- subset(
  scrna, 
  subset = nFeature_RNA > 800 & nFeature_RNA < 7000 & percent.mt < 15)

HTO QC Metrics, subsetting, clustering

source(here::here('r','scrna','HTOdemux.R'))

scrna[['HTO']] <- CreateAssayObject(
  counts = scrna.data$`Antibody Capture`[hto_rows, colnames(scrna)])

#all data loaded, clear raw data from memory and do garbage collect
#rm('scrna.data')
gc()

scrna <- NormalizeData(scrna, assay = "HTO", normalization.method = "CLR")

scrna <- DoubleHTODemux(scrna, positive.quantile = 0.90, ncenters=25)

ggplot(
    data.table(scrna@meta.data)[, .N, 
      by=.(HTO_classification, HTO_clusters, hash.ID)],
    aes(y=HTO_classification, x=hash.ID, fill=log10(N))) + 
  geom_tile() + facet_wrap(~HTO_clusters, scales='free') +
  labs(title='Cluster vs HTO assignment')

# assign match for best cluster
cluster.hash.ID <- data.table(scrna@meta.data)[, 
  .N, by=.(HTO_classification, HTO_clusters, hash.ID)][, 
    list(cluster.hash.ID= hash.ID[hash.ID != 'Triplets'][
      which.max(N[hash.ID != 'Triplets'])]), by=.(HTO_clusters)]

updated_metadata <- cluster.hash.ID[
  data.table(scrna@meta.data), 
    on='HTO_clusters', nomatch=NA, all=T][
    is.na(cluster.hash.ID), cluster.hash.ID := 'Triplet']

updated_metadata[, HTO_best_doublet := gsub('\\+\\d+','',HTO_classification)]
updated_metadata[, HTO_miscluster := (
  as.character(cluster.hash.ID) != as.character(HTO_best_doublet))]

# margin to throw away clustering points if third-closest HTO's
# signal strength relative to second is beyond this 
HTO_margin_cutoff <- -1

ggplot(updated_metadata[
    HTO_miscluster == F & 
    (as.character(hash.ID) != as.character(cluster.hash.ID))]) + 
  geom_jitter(aes(x=HTO_margin, y=HTO_classification), alpha=0.2) + 
  facet_wrap(~cluster.hash.ID, scales='free') + 
  geom_vline(aes(xintercept=-1))

# mark cells with HTO margin above cutoff as misclustered
updated_metadata[
  (as.character(hash.ID) != as.character(cluster.hash.ID)) & 
  HTO_margin > HTO_margin_cutoff, 
    HTO_miscluster := T]

# assign cluster hashes if not misclustered
updated_metadata[HTO_miscluster == F, hash.ID := cluster.hash.ID]

# merge with metadata file, add to HTO scrna metadata
HTO_sample_metadata <- fread(here::here(
    '..','..','scrnaseq_tcsl154','meta','sample_metadata.csv'))
names(HTO_sample_metadata)[2] <- 'hash.ID'
updated_metadata <- HTO_sample_metadata[updated_metadata, on='hash.ID']

#if hash combo is not a sample, label negative
updated_metadata[
  is.na(sample_num) & !(hash.ID %in% c('Triplets', 'Singlets', 'Negative')), 
  hash.ID := 'Negative']

# update metadata in object
updated_metadata <- as.data.frame(updated_metadata)
rownames(updated_metadata) <- colnames(scrna)
scrna@meta.data <- updated_metadata

#assign new cell identities
Idents(object = scrna) <- scrna@meta.data$hash.ID

#make a subset for tsne plot
hto_subset <- subset(scrna, cells=sample(Cells(scrna), 10000))

hto.dist.mtx <- as.matrix(parDist(t(
  GetAssayData(object = hto_subset, assay = "HTO"))))

hto_subset <- RunTSNE(hto_subset, 
  distance.matrix = hto.dist.mtx,
  perplexity = 100)

DimPlot(hto_subset, label=T)
DimPlot(hto_subset, split='hash.ID',ncol=4)

Analyzing RNA Data for TCSL 155

scrna.hashed <- subset(scrna, idents = HTO_sample_metadata$hash.ID)

DefaultAssay(scrna.hashed) <- "RNA"
scrna.hashed <- NormalizeData(scrna.hashed, 
    assay='RNA', 
    normalization.method = "LogNormalize", scale.factor = 10000)

scrna.hashed <- ScaleData(
  scrna.hashed, assay='RNA',
  features= VariableFeatures(scrna.hashed))
 
scrna.hashed <- FindVariableFeatures(scrna.hashed,
    selection.method = "vst",
    nfeatures = 2000)

saveRDS(scrna.hashed, file = here::here(
    '..','..','scrnaseq_tcsl154',
    'rds','scrna.hashed.rds'))

Separating CD4s and CD8s

First, get adt tags that separate CD4s and CD8s, and add rnas as controls.

if (!('scrna.hashed' %in% ls()))
  scrna.hashed <- readRDS(here::here(
    '..','..','scrnaseq_tcsl154',
    'rds','scrna.hashed.rds'))

DefaultAssay(scrna.hashed) <- "ADT"
scrna.hashed <- NormalizeData(scrna.hashed, assay = "ADT", normalization.method = "CLR")
## Normalizing across features
scrna.hashed <- ScaleData(scrna.hashed, assay = "ADT")
## Centering and scaling data matrix
scrna.hashed <- FindVariableFeatures(scrna.hashed)

adt_features <- scrna.hashed@assays$ADT@var.features

cd4v8_subset_vars <- c(adt_features, 'rna_CD8A','rna_CD8B','rna_CD4')

cd4v8.dt <- data.table(FetchData(
  object = scrna.hashed, 
  vars = cd4v8_subset_vars))

combined_pca <- prcomp(cd4v8.dt)

# calculate pca stats
pca.dt <- data.table(pc = data.table(colnames(combined_pca$rotation))[, 
    PC := as.integer(gsub("[A-Z]", "", V1))][, PC], 
    sd = combined_pca$sdev, 
    var = combined_pca$sdev^2, 
    var.norm = combined_pca$sdev^2/sum(combined_pca$sdev^2), 
    var.acc = cumsum(combined_pca$sdev^2/sum(combined_pca$sdev^2)))

melt.pca.dt <- melt(
  pca.dt, measure.vars = c("var.norm", "var.acc"), variable.name = "metric")

pca.stats.plot <- ggplot(melt.pca.dt) +
  geom_line(aes(x = pc, y = value, color = metric)) +
  geom_point(aes(x = pc, y = value, color = metric)) +
  scale_x_continuous(limits = c(1, NA)) +
  labs(title = "Fraction of Variance Captured by Principal Components, Every Sort Group", 
       x = "Principal Component", y = "Fraction of Variance") +
  theme_bw()

pca.stats.plot

# project data onto principal components
projected.metrics <- scale(cd4v8.dt, combined_pca$center, combined_pca$scale) %*% combined_pca$rotation

#vectorized switch:
names_4v8 <- c('4-8-', '8+4-', '4+8-', '4+8+')
categories_4v8 <- c('CD8','intermediate','intermediate','CD4')

projected.metrics <- cbind.data.frame(cd4v8.dt, projected.metrics)
projected.metrics[, is_CD4_adt := CD4.adt > 1.2]
projected.metrics[, is_CD8_adt := CD8A.adt > 0.5]
projected.metrics[, adt_state := names_4v8[1+(2*is_CD4_adt + is_CD8_adt)]]
projected.metrics[, is_CD4_rna := rna_CD4 > 0]
projected.metrics[, is_CD8B_rna := rna_CD8B > 0]
projected.metrics[, is_CD8A_rna := rna_CD8A > 0]
projected.metrics[, rna_state := names_4v8[1+(2*is_CD4_rna + (is_CD8B_rna | is_CD8A_rna))]]

intercepts <- c(.89, 1.95)
slopes <- c(.77, 1.45)

projected.metrics[, cd8_classify := PC2 > (slopes[1]*PC1 + intercepts[1])]
projected.metrics[, cd4_classify := PC2 > (slopes[2]*PC1 + intercepts[2])]
projected.metrics[, cd_category := categories_4v8[1+(2*cd4_classify + cd8_classify)]]

plot_grid(
  ggplot(projected.metrics[], aes(x = PC1, y = PC2, color=adt_state)) + 
    geom_point(alpha=0.1) + 
    geom_abline(intercept=intercepts, slope=slopes),
  ggplot(projected.metrics[], aes(x = PC1, y = PC2, color=adt_state)) + 
    facet_wrap(~adt_state) + 
    geom_point(alpha=0.1) + 
    geom_abline(intercept=intercepts, slope=slopes),
  ggplot(projected.metrics[], aes(x = PC1, y = PC2, color=adt_state)) + 
    facet_wrap(~rna_state) + 
    geom_point(alpha=0.1) + 
    geom_abline(intercept=intercepts, slope=slopes),
  ggplot(projected.metrics[], aes(x = PC1, y = PC2, color=cd_category)) + 
    geom_point(alpha=0.1) + 
    geom_abline(intercept=intercepts, slope=slopes),
  ncol=1, 
  labels=c('', 'Grouped by presence of CD4/CD8 ADT', 'Grouped by presence of CD4/CD8 RNA','Manual Classification'), 
  scale=0.9)

scrna.hashed@meta.data[['CD4v8']] <- projected.metrics$cd_category

if (reload_data) saveRDS(scrna.hashed, file = here::here(
    '..','..','scrnaseq_tcsl154',
    'rds','scrna.hashed.rds'))

Make UMAP plots and objects for each subset

if (!('scrna.hashed' %in% ls()))
  scrna.hashed <- readRDS(here::here(
    '..','..','scrnaseq_tcsl154',
    'rds','scrna.hashed.rds'))

umap_plot <- function(subset, plot_title_text="", assay, cluster_resolution) {

    subset.markers <- FindAllMarkers(subset,
    features = VariableFeatures(subset),
    min.pct = 0.5, logfc.threshold = 0.25)
  
  top.subset.markers <- subset.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
  
  top_genes_plot <- ggplot(top.subset.markers) + 
    geom_bar(aes(y=avg_logFC, x=reorder(gene, avg_logFC)), stat='identity') + 
    facet_wrap(~cluster, scales='free', ncol=3) +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
  
  cluster_col_name <- paste0(assay,'_snn_res.',as.character(cluster_resolution))
  
  cluster_pct_plot <- ggplot(
      data.table(subset@meta.data)[
        CD4v8 != 'intermediate', .N, 
        by=c('car', cluster_col_name, 'CD4v8', 'k_type')][,
          list(cluster=get(cluster_col_name), t_type=CD4v8, frac=N/sum(N)), 
          by=c('car', 'CD4v8', 'k_type')]) + 
    geom_bar(aes(x=car, y=frac, fill=frac), stat='identity', color='grey50') +
    scale_fill_distiller(palette='YlGnBu', direction=1) +
    facet_grid(k_type+t_type~cluster) + 
    theme_bw() +
    coord_flip()
  
  plot_title <- ggdraw() + 
    draw_label(
      plot_title_text,
      fontface = 'bold',
      x = 0,
      hjust = 0
    ) +
    theme(
      # add margin on the left of the drawing canvas,
      # so title is aligned with left edge of first plot
      plot.margin = margin(0, 0, 0, 7)
    )
  
  gene_heatmap <- DoHeatmap(subset,
        features = unique(
          (top.subset.markers %>% group_by(cluster) 
            %>% top_n(n=5, wt= avg_logFC))$gene))
      
  umap_plots <- plot_grid(
    plot_grid(
      plot_title,
      DimPlot(subset, reduction = "umap", label=T),
      cluster_pct_plot,
      rel_heights=c(0.2,1.25,1),
      ncol=1),
    gene_heatmap,
    ncol=2)
  
  return(umap_plots)
}

umap_subset <- function(this_subset, 
    plot_title_text="", assay='RNA', cluster_resolution=0.8, save=T, rerun=F) {
  
  DefaultAssay(this_subset) <- assay
  this_subset <- subset(this_subset, 
    features=rownames(this_subset@assays[[assay]]))
  
  rds_file <- here::here(
    '..','..','scrnaseq_tcsl154',
    'rds',
    paste0(
        'scrna.hashed.',
        gsub(' ','.',plot_title_text),
        '.rds'))
  
  if (rerun || !file.exists(rds_file)) {
    
      if (assay == 'RNA') {
        this_subset <- NormalizeData(
            this_subset,
            normalization.method = "LogNormalize", 
            scale.factor = 10000)
      } else if (assay == 'ADT') {
          this_subset <- NormalizeData(
            this_subset,
            normalization.method = "CLR")
      }
      
      this_subset <- FindVariableFeatures(
          this_subset, 
          selection.method = "vst",
          nfeatures = 2000)
      
      this_subset <- ScaleData(
        this_subset, assay=assay,
        features= VariableFeatures(this_subset))
    
      this_subset <- RunPCA(
        this_subset, features = VariableFeatures(this_subset))
    
      this_subset <- FindNeighbors(this_subset, reduction = "pca", dims = 1:8)
      this_subset <- FindClusters(this_subset, resolution = cluster_resolution, algorithm=1)
      this_subset <- RunUMAP(this_subset, dims = 1:8)
    
      this_subset@meta.data[['car.ttype.ktype']] <- paste(
        this_subset@meta.data[['car']],
        this_subset@meta.data[['CD4v8']],
        this_subset@meta.data[['k_type']], sep='_')
  } else {
      this_subset <- readRDS(rds_file)
  }
  
  subset_umap_plot <- umap_plot(this_subset, plot_title_text, assay, cluster_resolution)
  
  if ((rerun | (!file.exists(rds_file))) & save)
    saveRDS(this_subset, file = rds_file)
      
  return(list(plots=subset_umap_plot, rds_file=rds_file))
}

subset_fxns= list(
    function(obj) obj,
    function(obj) subset(obj, subset = (CD4v8 == 'CD4')),
    function(obj) subset(obj, subset = (CD4v8 == 'CD8')),
    function(obj) subset(obj, subset = (k_type == 'cd19+')),
    function(obj) subset(obj, subset = (k_type == 'cd19+' & CD4v8 == 'CD4')),
    function(obj) subset(obj, subset = (k_type == 'cd19+' & CD4v8 == 'CD8')))
subset_names= c('all cells','CD4','CD8','CD19','CD4 CD19','CD8 CD19')

umap_plots <- data.table(subset_name= subset_names, subset_fxn_num=1:6)

umap_plots[, ADT := list(list(
    umap_subset(subset_fxns[[subset_fxn_num]](scrna.hashed), plot_title_text=paste0(.BY[1],' ADT'), assay='ADT'))), 
    by='subset_name']
umap_plots[, RNA := list(list(
    umap_subset(subset_fxns[[subset_fxn_num]](scrna.hashed), plot_title_text=paste0(.BY[1],' RNA'), assay='RNA'))), 
    by='subset_name']

saveRDS(umap_plots, file = here::here(
    '..','..','scrnaseq_tcsl154',
    'rds','scrna.umap.plots.rds'))

Show UMAP Plots

ADT plots

if (!('umap_plots' %in% ls()))
  umap_plots <- readRDS(here::here(
    '..','..','scrnaseq_tcsl154',
    'rds','scrna.umap.plots.rds'))

umap_plots[, print(ADT[[1]]), by='subset_name']
## $plots

## 
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.all.cells.ADT.rds"
## 
## $plots

## 
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD4.ADT.rds"
## 
## $plots

## 
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD8.ADT.rds"
## 
## $plots

## 
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD19.ADT.rds"
## 
## $plots

## 
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD4.CD19.ADT.rds"
## 
## $plots

## 
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD8.CD19.ADT.rds"
if (!('scrna.hashed' %in% ls()))
  scrna.hashed <- readRDS(here::here(
    '..','..','scrnaseq_tcsl154',
    'rds','scrna.hashed.rds'))

RNA plots

umap_plots[, print(RNA[[1]]), by='subset_name']
## $plots

## 
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.all.cells.RNA.rds"
## 
## $plots

## 
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD4.RNA.rds"
## 
## $plots

## 
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD8.RNA.rds"
## 
## $plots

## 
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD19.RNA.rds"
## 
## $plots

## 
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD4.CD19.RNA.rds"
## 
## $plots

## 
## $rds_file
## [1] "/home/ec2-user/local-tcsl/src/../../scrnaseq_tcsl154/rds/scrna.hashed.CD8.CD19.RNA.rds"